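/* home *** CD-ROM | disk | FTP | other *** search -- Wrap
   (this appears to be navigation text from the page the file was saved from;
   it is not part of the C source) */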
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
/* ./glimpse/index/io.c */
#include "glimpse.h"
#include <stdio.h>
#include <sys/stat.h>

extern char INDEX_DIR[MAX_LINE_LEN];
extern int memory_usage;

/* --------------------------------------------------------------------
get_array_of_lines()
Reads inputfile line by line and stores a freshly allocated,
NUL-terminated copy of every line (without its trailing '\n') in table[].

input:	inputfile   - name of the file to read
	table       - array that receives the line pointers
	max_entry   - max number of entries in the table
	overflow_ok - when non-zero, failures are reported through the
		      return value instead of terminating the program
output:	the number of lines stored in table[].
	If the file cannot be opened, or an allocation fails (in which case
	the lines stored so far are released again): returns 0 when
	overflow_ok is set, otherwise prints a message and exits.
	If more than max_entry lines are present: returns the count reached
	when overflow_ok is set, otherwise prints a message and exits.

NOTE: the overflow test runs after the store, so table[max_entry] is
written before overflow is detected; table[] must therefore have room for
max_entry + 1 pointers.  This is kept as is because callers may depend on
the returned count.
----------------------------------------------------------------------*/
int
get_array_of_lines(inputfile, table, max_entry, overflow_ok)
char *inputfile;
unsigned char *table[];
int max_entry;	/* max number of entries in the table */
int overflow_ok;	/* flag for handling overflow */
{
	int tx = 0;	/* index for table */
	FILE *file_in;
	unsigned char buffer[MAX_NAME_BUF];
	char *np;
	int line_length;

	if ((file_in = fopen(inputfile, "r")) == NULL) {
		if (overflow_ok) return 0;
		fprintf(stderr, "can't open %s for reading\n", inputfile);
		exit(3);
	}
	while (fgets((char *)buffer, MAX_NAME_BUF, file_in)) {
		line_length = strlen((char *)buffer);
		/* discard the '\n', but only if there is one: the last line of */
		/* a file, or a line longer than the buffer, has none */
		if ((line_length > 0) && (buffer[line_length - 1] == '\n'))
			buffer[line_length - 1] = '\0';
		np = (char *) my_malloc(sizeof(char) * (line_length + 2));
		if (np == NULL) {
			int i;

			fclose(file_in);	/* closed exactly once on this path */
			for (i = 0; i < tx; i++) {
#if BG_DEBUG
				memory_usage -= (strlen((char *)table[i]) + 2);
#endif /*BG_DEBUG*/
				my_free(table[i], 0);
			}
			if (overflow_ok) return 0;
			fprintf(stderr, "malloc failure in get_array_of_lines\n");
			exit(2);
		}
		table[tx++] = (unsigned char *)np;
		strcpy(np, (char *)buffer);
		if (tx > max_entry) {
			fclose(file_in);
			if (overflow_ok) return(tx);
			fprintf(stderr, "overflow in get_array_of_lines()\n");
			exit(2);
		}
	}
	fclose(file_in);
	return(tx);	/* return number of lines read */
}

/* --------------------------------------------------------------------
get_table():
Reads inputfile as a sequence of 4-byte big-endian integers.

input:	inputfile   - name of the file to read
	table       - array that receives the integers
	max_entry   - max number of entries in the table
	overflow_ok - when non-zero, an unreadable file yields 0 and a table
		      overflow just stops the reading
output:	the number of entries stored.  A trailing group of fewer than
	four bytes is ignored.  When overflow_ok is 0 the function exits
	if the file cannot be opened or the table overflows.

NOTE: as in get_array_of_lines(), the overflow test follows the store, so
table[] needs room for max_entry + 1 integers.
----------------------------------------------------------------------*/
int
get_table(inputfile, table, max_entry, overflow_ok)
char *inputfile;
int table[];
int max_entry;
int overflow_ok;
{
	unsigned int val;	/* assembled unsigned: a byte >= 128 shifted by 24 would overflow an int */
	int c;
	FILE *file_in;
	int tx = 0;	/* number of entries read */

	if ((file_in = fopen(inputfile, "r")) == NULL) {
		if (overflow_ok) return 0;
		fprintf(stderr, "can't open %s for reading\n", inputfile);
		exit(2);
	}
	while ((c = getc(file_in)) != EOF) {
		val = (unsigned int)c << 24;
		if ((c = getc(file_in)) == EOF) break;
		val |= (unsigned int)c << 16;
		if ((c = getc(file_in)) == EOF) break;
		val |= (unsigned int)c << 8;
		if ((c = getc(file_in)) == EOF) break;
		val |= (unsigned int)c;
		table[tx++] = (int)val;
		if (tx > max_entry) {
			if (!overflow_ok) {
				fprintf(stderr, "in get_table: table overflow\n");
				exit(2);
			}
			break;
		}
	}
	fclose(file_in);
	return(tx);
}

/*
 * Returns the number found on the second line of index file s, which has
 * the form "%<number>".  The first whitespace-delimited token of the file
 * ("%" followed by optional digits) is skipped.
 * Returns 0 if the file cannot be opened or does not have that layout.
 */
int
get_index_type(s)
char *s;
{
	FILE *fp = fopen(s, "r");
	char buf[32];
	int num = 0;	/* stays 0 when the number cannot be parsed */

	if (fp == NULL) return 0;
	/* buf is "%%" or "%%1234567890", num is >= 0; the field width keeps */
	/* a longer first token from overrunning buf */
	if (fscanf(fp, "%31s\n%%%d\n", buf, &num) != 2) num = 0;
	fclose(fp);
	return num;
}

/*
 * encodeNb()/decodeNb(): the encoders replace the values '\0' and '\n' by
 * the two values just above the legal range (MaxNum..Partition and
 * MaxNum..Partition + 1); the decoders undo that.  Presumably this keeps
 * encoded bytes free of NUL and newline so that they can be stored in the
 * line-oriented index files.  The wider variants split n into two halves
 * and encode each half with the next smaller variant.
 */

/* n is guaranteed to be < MaxNum4bPartition */
int
encode4b(n)
int n;
{
	if (n == '\0') return MaxNum4bPartition;
	if (n == '\n') return MaxNum4bPartition+1;
	return n;
}

int
decode4b(n)
int n;
{
	if (n == MaxNum4bPartition) return '\0';
	if (n == MaxNum4bPartition+1) return '\n';
	return n;
}

/* n is guaranteed to be < MaxNum8bPartition */
int
encode8b(n)
int n;
{
	if (n == '\0') return MaxNum8bPartition;
	if (n == '\n') return MaxNum8bPartition+1;
	return n;
}

int
decode8b(n)
int n;
{
	if (n == MaxNum8bPartition) return '\0';
	if (n == MaxNum8bPartition+1) return '\n';
	return n;
}

/* n is guaranteed to be < MaxNum12bPartition */
int
encode12b(n)
int n;
{
	unsigned char msb, lsb;

	msb = (n / MaxNum8bPartition);
	lsb = (n % MaxNum8bPartition);
	msb = encode4b(msb);
	lsb = encode8b(lsb);
	return (msb<<8)|lsb;
}

int
decode12b(n)
int n;
{
	unsigned char msb, lsb;

	msb = ((n&0x00000f00) >> 8);
	lsb = (n&0x000000ff);
	msb = decode4b(msb);
	lsb = decode8b(lsb);
	return (msb * MaxNum8bPartition) + lsb;
}

/* n is guaranteed to be < MaxNum16bPartition */
int
encode16b(n)
int n;
{
	unsigned char msb, lsb;

	msb = (n / MaxNum8bPartition);
	lsb = (n % MaxNum8bPartition);
	msb = encode8b(msb);
	lsb = encode8b(lsb);
	return (msb<<8)|lsb;
}

int
decode16b(n)
int n;
{
	unsigned char msb, lsb;

	msb = ((n&0x0000ff00) >> 8);
	lsb = (n&0x000000ff);
	msb = decode8b(msb);
	lsb = decode8b(lsb);
	return (msb * MaxNum8bPartition) + lsb;
}

/* n is guaranteed to be < MaxNum24bPartition */
int
encode24b(n)
int n;
{
	unsigned short msb, lsb;

	msb = (n / MaxNum16bPartition);
	lsb = (n % MaxNum16bPartition);
	msb = encode8b(msb);
	lsb = encode16b(lsb);
	return (msb<<16)|lsb;
}

int
decode24b(n)
int n;
{
	unsigned short msb, lsb;

	msb = ((n&0x00ff0000) >> 16);
	lsb = (n&0x0000ffff);
	msb = decode8b(msb);
	lsb = decode16b(lsb);
	return (msb * MaxNum16bPartition) + lsb;
}

/* n is guaranteed to be < MaxNum32bPartition */
int
encode32b(n)
int n;
{
	unsigned short msb, lsb;

	msb = (n / MaxNum16bPartition);
	lsb = (n % MaxNum16bPartition);
	msb = encode16b(msb);
	lsb = encode16b(lsb);
	/* shift as unsigned: the encoded upper half can have its top bit */
	/* set, and shifting that into the sign bit of an int is undefined */
	return (int)(((unsigned int)msb << 16) | lsb);
}

int
decode32b(n)
int n;
{
	unsigned short msb, lsb;

	msb = ((n&0xffff0000) >> 16);
	lsb = (n&0x0000ffff);
	msb = decode16b(msb);
	lsb = decode16b(lsb);
	return (msb * MaxNum16bPartition) + lsb;
}

extern unsigned char dest_index_buf[REAL_INDEX_BUF];
extern unsigned char src_index_buf[REAL_INDEX_BUF];

/*
 * Decodes the 4-byte (encode32b) file offset stored at srcbuf, seeks to it
 * in partfp and reads one line (at most REAL_INDEX_BUF - 1 bytes) into
 * destbuf.
 * Read offset from srcbuf first so that you can use it with srcbuf=destbuf.
 * Returns 0 on success.  If the seek or the read fails, destbuf is set to
 * an empty line ("\n") so that callers scanning for '\n' stop at once, and
 * -1 is returned.
 */
int
get_block_numbers(srcbuf, destbuf, partfp)
unsigned char *srcbuf, *destbuf;
FILE *partfp;
{
	int offset, pat_size;

	/* Does not do caching of blocks seen so far: done in OS hopefully */
	offset = (int)(((unsigned int)srcbuf[0] << 24) | ((unsigned int)srcbuf[1] << 16) |
		       ((unsigned int)srcbuf[2] << 8) | (unsigned int)srcbuf[3]);
	pat_size = decode32b(offset);
	if ((fseek(partfp, (long)pat_size, SEEK_SET) != 0) ||
	    (fgets((char *)destbuf, REAL_INDEX_BUF, partfp) == NULL)) {
		destbuf[0] = '\n';
		destbuf[1] = '\0';
		return -1;
	}
	return 0;
}

/*
 * Merges the index split by save_data_structures into a single index.
 *
 * Reads INDEX_DIR/INDEX_FILE line by line, copies the three header lines,
 * and for every word line writes: the attribute index re-encoded as 4 bytes
 * (structured indexes only), the word with its end mark, and then either
 * DONT_CONFUSE_SORT (for ALL_INDEX_MARK lines) or the block-number line
 * that the 4-byte offset following the word selects in INDEX_DIR/P_TABLE.
 * The result is written to INDEX_DIR/.glimpse_merge.<pid> and then renamed
 * over INDEX_FILE.  Exits with a message if a file cannot be opened or
 * written, or if the rename fails.  Always returns 0 otherwise.
 */
int
merge_splits()
{
	FILE *i_in;
	FILE *p_in;
	FILE *i_out;
	char s[MAX_LINE_LEN];		/* ends up holding the path of INDEX_FILE */
	char merged[MAX_LINE_LEN];	/* path of the temporary merged index */
	int j, wordoffset, index;
	int failed;
	unsigned char c;
	char indexnumberbuf[256];
	int onefileperblock = 0, structuredindex = 0;	/* defaults for a malformed header */

	sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
	if ((p_in = fopen(s, "r")) == NULL) {
		fprintf(stderr, "cannot open for reading: %s\n", s);
		exit(3);
	}
	sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
	if ((i_in = fopen(s, "r")) == NULL) {
		fprintf(stderr, "cannot open for reading: %s\n", s);
		exit(3);
	}
	sprintf(merged, "%s/.glimpse_merge.%d", INDEX_DIR, (int)getpid());
	if ((i_out = fopen(merged, "w")) == NULL) {
		fprintf(stderr, "cannot open for writing: %s\n", merged);
		exit(3);
	}

	/* modified the original in glimpse's main.c */
	if (fgets(indexnumberbuf, 256, i_in) == NULL) indexnumberbuf[0] = '\0';
	fputs(indexnumberbuf, i_out);
	fscanf(i_in, "%%%d\n", &onefileperblock);
	fprintf(i_out, "%%%d\n", onefileperblock);
	fscanf(i_in, "%%%d\n", &structuredindex);
	if (structuredindex <= 0) structuredindex = 0;
	fprintf(i_out, "%%%d\n", structuredindex);

	/* number of attribute-index bytes in front of every word; it depends */
	/* only on structuredindex, so it is computed once */
	if (!structuredindex) wordoffset = 0;
	else if (structuredindex < MaxNum8bPartition - 1) wordoffset = 1;
	else if (structuredindex < MaxNum16bPartition - 1) wordoffset = 2;
	else wordoffset = 4;

	while (fgets((char *)src_index_buf, REAL_INDEX_BUF, i_in)) {
		j = wordoffset;
		while ((j < REAL_INDEX_BUF) && (src_index_buf[j] != WORD_END_MARK) &&
		       (src_index_buf[j] != ALL_INDEX_MARK) && (src_index_buf[j] != '\n')) j++;
		if ((j >= REAL_INDEX_BUF) || (src_index_buf[j] == '\n')) continue;
		/* else it is WORD_END_MARK or ALL_INDEX_MARK */

		if (structuredindex) {
			/* convert all types of stuff to 4B indices to make merge_in()s easy in build_in.c */
			if (wordoffset == 1)
				index = decode8b(src_index_buf[0]);
			else if (wordoffset == 2)
				index = decode16b((src_index_buf[0] << 8) | src_index_buf[1]);
			else
				index = decode32b((int)(((unsigned int)src_index_buf[0] << 24) |
							((unsigned int)src_index_buf[1] << 16) |
							((unsigned int)src_index_buf[2] << 8) |
							(unsigned int)src_index_buf[3]));
			index = encode32b(index);
			putc((index & 0xff000000) >> 24, i_out);
			putc((index & 0x00ff0000) >> 16, i_out);
			putc((index & 0x0000ff00) >> 8, i_out);
			putc(index & 0x000000ff, i_out);
		}

		/* write the word including its end mark: terminate the string */
		/* right after the mark for fputs, then restore the byte */
		c = src_index_buf[j+1];
		src_index_buf[j+1] = '\0';
		fputs((char *)src_index_buf+wordoffset, i_out);
		src_index_buf[j+1] = c;

		if (src_index_buf[j] == ALL_INDEX_MARK) {
			fputc(DONT_CONFUSE_SORT, i_out);
			fputc('\n', i_out);
			continue;
		}

		if (get_block_numbers(&src_index_buf[j+1], dest_index_buf, p_in) < 0)
			fprintf(stderr, "merge_splits: cannot read block numbers for an index entry\n");
		/* copy the block numbers up to the newline, never past the buffer */
		for (j = 0; (j < REAL_INDEX_BUF) && (dest_index_buf[j] != '\n'); j++)
			fputc(dest_index_buf[j], i_out);
		if (fputc('\n', i_out) == EOF) {
			fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);
			exit(2);
		}
	}
	fclose(i_in);
	fclose(p_in);

	/* do not replace the index with a file that was not completely written */
	failed = (fflush(i_out) == EOF);
	if (fclose(i_out) == EOF) failed = 1;
	if (failed) {
		fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);
		unlink(merged);
		exit(2);
	}
	/* rename() instead of system("mv ..."): no shell is involved, so the */
	/* directory name may contain blanks or shell metacharacters */
	if (rename(merged, s) != 0) {
		fprintf(stderr, "cannot rename %s to %s\n", merged, s);
		exit(2);
	}
	return 0;
}

/* Returns 1 if c is a character (other than \, ? and *) that agrep treats specially, else 0 */
static int
io_agrep_special(c)
int c;
{
	switch (c) {
	case '-': case ',': case ';': case '.': case '#': case '|':
	case '[': case ']': case '(': case ')': case '>': case '<':
	case '^': case '$': case '+': case '{': case '}': case '~':
		return 1;
	default:
		return 0;
	}
}

/*
 * Converts, in place, a file-name pattern that may contain the wildcards
 * '?' and '*' into an agrep regular expression: '?' becomes ".", '*'
 * becomes ".*", every other agrep-special character is masked with a
 * backslash, and a backslash followed by a character is copied unchanged.
 * One unescaped '*' at the end and one '*' at the beginning are dropped
 * first.
 *
 * buf: the pattern (in-out), NUL-terminated at buf[len]; the converted
 *	pattern can be up to twice as long as the input, so the caller's
 *	buffer must have room for that (and at least 3 bytes).
 * len: strlen of the pattern.
 *
 * Returns 0 if len < 1.  If nothing but the optional outer '*'s had to be
 * interpreted, the name is left without conversion and its (positive)
 * length is returned, so that memagrep calls can be avoided.  Otherwise
 * the negated strlen of the converted pattern is returned (-2 with ".*"
 * when only '*'s were given): hence agrep never needs to be called when
 * the length is positive.  A converted pattern that does not fit in
 * MAX_PAT bytes is truncated at a token boundary and a message is printed.
 */
int
convert2agrepregexp(buf, len)
char *buf;
int len;
{
	char tbuf[MAX_PAT];
	int i = 0, j = 0;
	int need;	/* number of output bytes the current input token produces */
	int toolong = 0;
	int c;

	/* Ignore '*' at the beginning and '*' at the end */
	if (len < 1) return 0;
	/* the trailing '*' is dropped only when the preceding character does not escape it */
	if ((buf[len-1] == '*') && ((len == 1) || (buf[len-2] != '\\'))) {
		buf[len-1] = '\0';
		len--;
	}
	if (buf[0] == '*') {
		for (i = 0; i < len; i++) buf[i] = buf[i+1];	/* also moves the NUL */
		len--;
	}
	if (len < 1) {
		buf[0] = '.';
		buf[1] = '*';
		buf[2] = '\0';
		return -2;
	}

	/* is there any unescaped wildcard left? */
	for (i = 0; i < len; i++) {
		if (buf[i] == '\\') i++;
		else if ((buf[i] == '?') || (buf[i] == '*')) break;
	}
	if (i >= len) return len;

	i = j = 0;
	while ((i < len) && (buf[i] != '\0')) {
		c = buf[i];
		/* Consider all special characters interpreted by agrep */
		if (c == '\\') need = ((i + 1 < len) && (buf[i+1] != '\0')) ? 2 : 1;
		else if ((c == '*') || io_agrep_special(c)) need = 2;
		else need = 1;
		/* keep one byte for the NUL and never split a two-byte token */
		if (j + need >= MAX_PAT) {
			toolong = 1;
			break;
		}
		if (c == '\\') {
			/* copy two things without interpreting them; a backslash */
			/* that ends the pattern is copied alone */
			tbuf[j++] = buf[i++];
			if (need == 2) tbuf[j++] = buf[i++];
		}
		else if (io_agrep_special(c)) {
			tbuf[j++] = '\\';
			tbuf[j++] = buf[i++];
		}
		/* Interpret ONLY ? and * in file-names */
		else if (c == '?') {
			tbuf[j++] = '.';
			i++;
		}
		else if (c == '*') {
			tbuf[j++] = '.';
			tbuf[j++] = '*';
			i++;
		}
		else tbuf[j++] = buf[i++];
	}
	tbuf[j] = '\0';
	if (toolong) fprintf(stderr, "glimpseindex: pattern '%s' too long\n", buf);
	strcpy(buf, tbuf);
	return -j;	/* strlen-compatible, -ve to indicate memagrep must be called */
}

extern int num_filter;
extern int filter_len[MAX_FILTER];
extern CHAR *filter[MAX_FILTER];
extern CHAR *filter_command[MAX_FILTER];

/*
 * Loads the filter table from index_dir/FILTER_FILE into filter[],
 * filter_command[] and filter_len[], and sets num_filter.
 *
 * Each line has the form  <pattern> <command>  where the pattern is either
 * a run of non-blank characters or a '...'-quoted string (a backslash
 * protects the character after it).  The pattern is converted by
 * convert2agrepregexp() and its return value is stored in filter_len[].
 * Lines without a pattern or without a command are skipped, and at most
 * MAX_FILTER entries are read.
 *
 * The tables are always cleared first.  Nothing is loaded when dofilter is
 * 0 or when the filter file cannot be opened.  Exits if memory for an
 * entry cannot be allocated.  Always returns 0 otherwise.
 */
int
read_filters(index_dir, dofilter)
char *index_dir;
int dofilter;
{
	int len;
	int patlen;
	int patpos;
	int commandpos;
	FILE *filterfile;
	char filterbuf[MAX_LINE_LEN];
	char tempbuf[2 * MAX_LINE_LEN + 2];	/* the in-place conversion can double the pattern */
	char s[MAX_LINE_LEN];
	unsigned char *pat, *cmd;

	num_filter = 0;
	memset(filter, '\0', sizeof(CHAR *) * MAX_FILTER);
	memset(filter_command, '\0', sizeof(CHAR *) * MAX_FILTER);
	memset(filter_len, '\0', sizeof(int) * MAX_FILTER);
	if (!dofilter) return 0;

	sprintf(s, "%s/%s", index_dir, FILTER_FILE);
	filterfile = fopen(s, "r");
	if (filterfile == NULL) return 0;	/* no filter file is not an error */

	while ((num_filter < MAX_FILTER) && fgets(filterbuf, MAX_LINE_LEN, filterfile)) {
		len = strlen(filterbuf);
		/* drop the newline if there is one, and keep len in step */
		if ((len > 0) && (filterbuf[len-1] == '\n')) filterbuf[--len] = '\0';

		commandpos = 0;
		while ((commandpos < len) && ((filterbuf[commandpos] == ' ') || (filterbuf[commandpos] == '\t')))
			commandpos++;	/* leading spaces */
		if (commandpos >= len) continue;

		if (filterbuf[commandpos] == '\'') {
			/* quoted pattern: runs up to the next unescaped quote */
			commandpos++;
			patpos = commandpos;
			patlen = 0;
			while (commandpos < len) {
				if (filterbuf[commandpos] == '\\') {
					commandpos += 2;
					patlen += 2;
				}
				else if (filterbuf[commandpos] != '\'') {
					commandpos++;
					patlen++;
				}
				else break;
			}
			if ((commandpos >= len) || (patlen <= 0)) continue;	/* no closing quote, or empty */
			commandpos++;
		}
		else {
			/* plain pattern: runs up to the next blank */
			patpos = commandpos;
			patlen = 0;
			while ((commandpos < len) && (filterbuf[commandpos] != ' ') && (filterbuf[commandpos] != '\t')) {
				commandpos++;
				patlen++;
			}
		}

		/* the command is whatever follows the blanks after the pattern; */
		/* a line without a command is useless for either pattern form */
		while ((commandpos < len) && ((filterbuf[commandpos] == ' ') || (filterbuf[commandpos] == '\t')))
			commandpos++;
		if (commandpos >= len) continue;

		memcpy(tempbuf, &filterbuf[patpos], patlen);
		tempbuf[patlen] = '\0';
		if ((filter_len[num_filter] = convert2agrepregexp(tempbuf, patlen)) == 0) continue;	/* inplace conversion */
		pat = (unsigned char *) strdup(tempbuf);
		cmd = (unsigned char *) strdup(&filterbuf[commandpos]);
		if ((pat == NULL) || (cmd == NULL)) {
			fprintf(stderr, "malloc failure in read_filters\n");
			exit(2);
		}
		filter[num_filter] = pat;
		filter_command[num_filter] = cmd;
		num_filter++;
	}
	fclose(filterfile);
	return 0;
}

/*
 * Runs  command 'input' > output  through the shell.
 * Returns 1 if output then exists and holds at least 1 byte; otherwise
 * removes output and returns 0.  Nothing is run (and 0 is returned) if the
 * command line would not fit in MAX_LINE_LEN bytes, or if input contains a
 * single quote, which cannot be protected by the '...' quoting used here.
 * NOTE(review): output is passed to the shell unquoted, as before.
 */
static int
io_run_filter(command, input, output)
char *command, *input, *output;
{
	char s[MAX_LINE_LEN];
	char dummybuf[4];
	FILE *dummyin;
	char *p;

	/* 7 = strlen(" '") + strlen("' > ") + terminating NUL */
	if (strlen(command) + strlen(input) + strlen(output) + 7 > sizeof(s)) return 0;
	for (p = input; *p != '\0'; p++)
		if (*p == '\'') return 0;

	sprintf(s, "%s '%s' > %s", command, input, output);
	system(s);
	if (((dummyin = fopen(output, "r")) == NULL) || (fread(dummybuf, 1, 1, dummyin) < 1)) {
		if (dummyin != NULL) fclose(dummyin);
		unlink(output);
		return 0;
	}
	fclose(dummyin);
	return 1;
}

/*
 * Copies the contents of file from into file to, which is created or
 * truncated first (even if from cannot be read, like a shell redirection).
 * Returns 0 on success and -1 on failure.
 */
static int
io_copy_file(from, to)
char *from, *to;
{
	FILE *in, *out;
	char block[BUFSIZ];
	size_t got;
	int status = 0;

	if ((out = fopen(to, "w")) == NULL) return -1;
	if ((in = fopen(from, "r")) == NULL) {
		fclose(out);
		return -1;
	}
	while ((got = fread(block, 1, sizeof(block), in)) > 0) {
		if (fwrite(block, 1, got, out) != got) {
			status = -1;
			break;
		}
	}
	if (ferror(in)) status = -1;
	fclose(in);
	if (fclose(out) == EOF) status = -1;
	return status;
}

/*
 * Applies to file inname every filter whose pattern matches the name; each
 * successful filter works on the output of the previous one.
 *
 * inname:  name of the input file (in).
 * outname: name for the output (in-out).  The first filter writes to
 *	    outname, and every further one to the previous output name with
 *	    ".o" appended; on success the name of the final output is copied
 *	    back into outname, so the caller's buffer must have room for it.
 *
 * Returns
 * 1 if filter application was successful and the output (>1B) is in outname,
 * 2 if some pattern matched but there is no output,
 * 0 otherwise: sep 15-18 '94
 * For 0 and 2 the input is copied unchanged to outname, except when there
 * are no filters at all, a name is too long for the internal buffers, or
 * /dev/null cannot be opened: then 0 is returned and nothing is written.
 *
 * memagrep is initialized in partition.c for calls from dir.c, and it is
 * already done by the time we call this function from main.c
 */
int
apply_filter(inname, outname)
char *inname, *outname;	/* outname is in-out, inname is in */
{
	int i;
	char name[MAX_LINE_LEN];	/* '\n' + inname + ('\0' or '\n') */
	int name_len = strlen(inname);
	FILE *dummyout;
	char prevoutname[MAX_LINE_LEN];
	char newoutname[MAX_LINE_LEN];
	char *suffix;
	int matched;
	int ret = 0;
	int unlink_prevoutname = 0;

	if (num_filter <= 0) return 0;
	/* the names are copied into fixed-size buffers below */
	if ((name_len + 2 > MAX_LINE_LEN) || (strlen(outname) + 1 > MAX_LINE_LEN)) return 0;
	if ((dummyout = fopen("/dev/null", "w")) == NULL) return 0;

	/* ready for memgrep */
	name[0] = '\n';
	strcpy(name+1, inname);
	strcpy(prevoutname, inname);
	strcpy(newoutname, outname);

	/* Current properly filtered output is always in prevoutname */
	for (i = 0; i < num_filter; i++) {
		suffix = NULL;
		if (filter_len[i] > 0) {
			/* plain string: a substring search is enough */
			name[name_len + 1] = '\0';
			suffix = strstr(name+1, (char *)filter[i]);	/* Chris Dalton */
			matched = (suffix != NULL);
		}
		else {
			/* must call memagrep */
			name[name_len + 1] = '\n';	/* memagrep wants names to end with '\n': '\0' is not necessary */
			matched = (((filter_len[i] == -2) && (filter[i][0] == '.') && (filter[i][1] == '*')) ||
				   (memagrep_search(-filter_len[i], filter[i], name_len + 2, name, 0, dummyout) > 0));
		}
		if (!matched) continue;

		if (ret == 0) ret = 2;
		/* yes, it matched: now apply the command and get the output */
		if (!io_run_filter((char *)filter_command[i], prevoutname, newoutname)) continue;

		/* Filter was successful: output exists and has atleast 1 byte in it */
		if (unlink_prevoutname) unlink(prevoutname);
		strcpy(prevoutname, newoutname);
		ret = 1;
		unlink_prevoutname = 1;

		/* if the matched text was a proper suffix of the name, */
		/* remove the suffix just processed before examining the */
		/* name again. Chris Dalton */
		/* And I don't know what the equivalent thing is with */
		/* memagrep_search: since it doesn't return a pointer to */
		/* the place where the match occured. Burra Gopal */
		if ((suffix != NULL) && (strcmp((char *)filter[i], suffix) == 0)) {
			name_len -= strlen(suffix);
			*suffix = '\0';
		}

		/* stop before the next output name (prevoutname + ".o") gets too long */
		if (strlen(prevoutname) + 3 >= MAX_LINE_LEN) break;
		sprintf(newoutname, "%s.o", prevoutname);
	}

	if (ret == 1) strcpy(outname, prevoutname);
	else {
		/* dummy filter that copies input to output: caller can use inname but this has easy interface */
		io_copy_file(inname, outname);
	}
	fclose(dummyout);
	return ret;
}

/* Use a modified wais stoplist to do this with simple strcmp's in a for loop */
/* (not done yet: no word is ever reported as a stop word) */
int
static_stop_list(word)
char *word;
{
	return 0;
}

/*
 * crazy hash function that operates on 4K hashtables: bug fixes by Chris Dalton
 * Each of the first (up to) four characters contributes its low 3 bits;
 * every further character is added modulo 4096.  Returns a value in the
 * range 0..4095.  The whole word is hashed: a leading directory name, if
 * any, is not discarded.
 */
int
hash4k(word, len)
char *word;
int len;
{
	unsigned int hash_value = 0;
	unsigned int mask_3 = 07;
	unsigned int mask_12 = 07777;
	int i;

	for (i = 0; (i < len) && (i < 4); i++)
		hash_value = (hash_value << 3) | (word[i] & mask_3);
	for (; i < len; i++)
		hash_value = mask_12 & (hash_value + word[i]);
	return (hash_value & mask_12);
}